import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3
def __iter__(self):
    """Stand-in ``__iter__`` that is bound onto objects lacking one.

    The value itself is a constant ``0``; the function exists only so that
    ``hasattr(obj, "__iter__")`` becomes true once it is attached.
    """
    return 0
# @hidden_cell
# The following code reads three CSV files from IBM Cloud Object Storage.
# It includes your credentials; you might want to remove those credentials
# before you share the notebook.
# NOTE(review): the API key below is stored in plain text in this file --
# consider rotating it and loading it from a secret store instead.

# Pick the public endpoint for external runtimes, the private one otherwise.
if os.environ.get('RUNTIME_ENV_LOCATION_TYPE') == 'external':
    endpoint_4b15fa3394a14800b00716fbf214f282 = 'https://s3.us.cloud-object-storage.appdomain.cloud'
else:
    endpoint_4b15fa3394a14800b00716fbf214f282 = 'https://s3.private.us.cloud-object-storage.appdomain.cloud'

client_4b15fa3394a14800b00716fbf214f282 = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id='UK9E4LhblYhtCOzQ9wMS3cP0ALg0Ne_Z_neaRkm7s5zn',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_4b15fa3394a14800b00716fbf214f282,
)

# Fetch each object in turn and parse it; order matches the unpacking below.
_frames = []
for _key in ('df_train.csv', 'df_test.csv', 'df_true.csv'):
    body = client_4b15fa3394a14800b00716fbf214f282.get_object(
        Bucket='predictenginefailures-donotdelete-pr-dzldltk75qmkfh',
        Key=_key,
    )['Body']
    # add missing __iter__ method, so pandas accepts body as file-like object
    if not hasattr(body, "__iter__"):
        body.__iter__ = types.MethodType(__iter__, body)
    _frames.append(pd.read_csv(body))

df_train, df_test, df_truth = _frames
# Quick look at each dataset: dimensions, the DataFrame.info() report, and
# the first rows.
# DataFrame.info() writes its report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
print('Shape:', df_train.shape)
df_train.info()
df_train.head()

print('Shape:', df_test.shape)
df_test.info()
df_test.head()

print('Shape:', df_truth.shape)
df_truth.info()
df_truth.head()
# Boxplots of sensor data: one panel per sensor on a 7x3 grid.
plt.figure(figsize=(16, 21))
for sensor_no in range(1, 22):
    # Sensor k is read from positional column k + 4 of the training frame.
    readings = df_train.iloc[:, sensor_no + 4]
    plt.subplot(7, 3, sensor_no)
    plt.boxplot(readings)
    plt.title(f"Sensor {sensor_no}, column {sensor_no + 5}")
plt.show()
# Comparison of train and test data: for each selected column position,
# overlay the kernel density estimate of the train and test values.
selected_columns = [6, 7, 8, 11, 12, 13, 15, 16, 17, 18, 19, 21, 24, 25]
plt.figure(figsize=(15, 21))
for panel, col in enumerate(selected_columns, start=1):
    plt.subplot(7, 3, panel)
    for frame, colour, name in ((df_train, "blue", "Train"), (df_test, "red", "Test")):
        sns.kdeplot(frame.iloc[:, col], legend=False, color=colour, label=name)
    plt.title(f"Sensor {col - 4}, column {col + 1}")
    plt.legend()
plt.show()
# Horizontal bar chart of the highest cycle number recorded for each engine
# in the training data.
plt.figure(figsize=(20, 50))
ax = df_train.groupby('engine_id')['cycle'].max().plot(
    kind='barh', width=0.8, stacked=True, align='center', rot=0
)
plt.title("Engine Lifetime", fontweight='bold', size=35)
plt.xlabel('Cycle Time', fontweight='bold', size=30)
plt.xticks(size=25)
plt.ylabel('Engine ID', fontweight='bold', size=30)
plt.yticks(size=25)
plt.grid(True)
# tight_layout() takes only keyword arguments (pad, h_pad, w_pad, rect);
# a positional True is rejected by current Matplotlib, so use the defaults.
plt.tight_layout()
plt.show()
df_train.engine_id.unique()

# Highest cycle number recorded for each engine in the test data.
cols = ['engine_id', 'max_cycles']  # maximum number of cycles performed by each engine
rul = df_test.groupby('engine_id')['cycle'].max().reset_index()
rul.columns = cols
print('Shape:', rul.shape)
rul
# Now the rul and truth data have the same dimensions.
# Compute 'rtf' for each engine in the truth data: the remaining cycles
# listed there plus the highest cycle already observed in the test data.
# max_cycles is looked up by engine_id so the result does not depend on
# df_truth and rul happening to share the same row order.
max_cycles_by_engine = rul.set_index('engine_id')['max_cycles']
df_truth['rtf'] = df_truth['rem_cycles'] + df_truth['engine_id'].map(max_cycles_by_engine)
print('Shape:', df_truth.shape)
df_truth.head()
# Attach each engine's 'rtf' to its test rows, derive the per-row 'ttf'
# (rtf minus the current cycle), then discard the helper columns.
df_truth.drop(columns=['rem_cycles'], inplace=True)
df_test = df_test.merge(df_truth, how='left', on=['engine_id'])
df_test['ttf'] = df_test['rtf'].sub(df_test['cycle'])
df_test.drop(columns=['rtf'], inplace=True)
print('Shape:', df_test.shape)
df_test.head()
# 'ttf' for the training data: each engine's highest recorded cycle minus
# the row's current cycle.
# The string alias 'max' is used instead of the builtin max, which pandas
# has deprecated as a groupby transform argument.
df_train['ttf'] = df_train.groupby(['engine_id'])['cycle'].transform('max') - df_train['cycle']
print('Shape:', df_train.shape)
df_train.head()
df_train.ttf.value_counts()
# Work on copies so the labelled frames do not modify df_train / df_test.
dfn_train = df_train.copy()  # new train data
dfn_test = df_test.copy()  # new test data

period = 30  # threshold

# Binary label: 1 when ttf <= period, otherwise 0.  A vectorised comparison
# replaces the row-by-row apply(lambda ...) with the same result.
dfn_train['label_bc'] = (dfn_train['ttf'] <= period).astype(int)
dfn_test['label_bc'] = (dfn_test['ttf'] <= period).astype(int)

print('For train data-')
print('min ttf:', dfn_train.ttf.min())
print('max ttf:', dfn_train.ttf.max())
print('mean ttf:', dfn_train.ttf.mean())
dfn_train.head()
dfn_train.label_bc.value_counts()
# Feature columns: setting1..setting3 followed by s1..s21.
features_col_name = [f'setting{k}' for k in range(1, 4)] + [f's{k}' for k in range(1, 22)]
target_col_name = 'label_bc'

# Fit the min-max scaler on the training features only, then apply the same
# fitted scaling to both the train and test frames.
sc = MinMaxScaler()
sc.fit(dfn_train[features_col_name])
dfn_train[features_col_name] = sc.transform(dfn_train[features_col_name])
dfn_test[features_col_name] = sc.transform(dfn_test[features_col_name])
dfn_train.head()
# Plotting pairwise relationships in a dataset.
sns.pairplot(dfn_train)
# plt.plot() with no arguments draws nothing; plt.show() renders the figure,
# as the other plotting cells in this file do.
plt.show()